
# pip install lxml

from lxml import etree

import requests
import re

'''
    Parse the Baidu News home page with XPath (via lxml)
'''

# Request headers: send a desktop-browser User-Agent with the request.
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
}

url = 'http://news.baidu.com/'
# requests has no default timeout; without one a stalled connection would
# block this script forever.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing and saving an error page.
response.raise_for_status()
data = response.content.decode('utf-8')

# 1. Parse the HTML text into an element tree that supports XPath.
xpath_data = etree.HTML(data)

# 2. Run XPath queries. xpath() returns a list for each of these queries.
# 2.1 Absolute path: text of the <title> element.
result = xpath_data.xpath('/html/head/title/text()')
# 2.2 Descendant search (//): text of every <a> anywhere in the document.
result_1 = xpath_data.xpath('//a/text()')
# 2.3 Attribute filter, //a[@attribute="value"]: text of <a mon="r=1">.
result_2 = xpath_data.xpath('//a[@mon="r=1"]/text()')
# 2.4 Attribute extraction (@href): href values of the same <a> elements.
result_3 = xpath_data.xpath('//a[@mon="r=1"]/@href')

# Text of the <a> elements that are direct children of an <li>.
# NOTE: a positional predicate such as //li/a[1] is evaluated relative to
# each parent <li>, so it does not pick "the N-th match overall"; the matched
# <a> elements live under different parents.
result_4 = xpath_data.xpath('//li/a/text()')

print(result_4)

# Keep a local copy of the fetched page for offline inspection.
with open('baidu_news.html', 'w', encoding='utf-8') as f:
    f.write(data)




